We will explore the georeferenced event data set from the Uppsala Conflict Data Program (UCDP).
To start, we download and clean the data. Then we do some light analysis and plotting. Finally, we will make a prediction using the insights we’ve gained.
This is the nitty gritty of getting the data and putting it into a usable form. An important step, but a tedious one to read through. Feel free to skip ahead.
# Request the first page (up to 1000 events) of 2019 GED events from the
# UCDP API
first_page_url <- 'https://ucdpapi.pcr.uu.se/api/gedevents/20.1?pagesize=1000&StartDate=2019-01-01&EndDate=2019-12-31'
prac <- fromJSON(first_page_url)
# The event records sit under the `Result` element of the response
result <- prac[['Result']]
# Replace every NULL element of a list with NA.
#
# x: a list (typically one parsed JSON record). Elements are inspected one
#    level deep only; nested lists are not descended into.
# Returns x with the same length and names, where each NULL element has been
# replaced by NA. An empty input is returned unchanged.
null_to_na <- function(x) {
  # seq_along() yields an empty sequence for empty input, whereas
  # 1:length(x) would iterate over c(1, 0) and fail on x[[1]]
  for (i in seq_along(x)) {
    if (is.null(x[[i]])) {
      x[[i]] <- NA
    }
  }
  x
}
# Set nulls to NA so that every record keeps the same set of fields when it
# is converted to a data frame
result <- lapply(result, null_to_na)
# Fail early with a clear message if the first page came back empty
stopifnot(length(result) > 0)
# Turn each record into a data frame and stack them with a single rbind()
# call. This avoids re-copying the growing frame on every iteration and
# also works when the page holds exactly one record.
c_df <- do.call(rbind, lapply(result, data.frame))
# Collect the URLs of all remaining pages by following each response's
# NextPageUrl until the API stops returning one (no hard-coded page limit)
URL <- prac$NextPageUrl
url_list <- character(0)
if (length(URL) == 1 && !is.na(URL) && nzchar(URL)) {
  url_list <- URL
  repeat {
    listing <- fromJSON(URL)
    next_url <- listing$NextPageUrl
    # A missing or empty NextPageUrl marks the last page
    if (length(next_url) != 1 || is.na(next_url) || !nzchar(next_url)) {
      break
    }
    # Short pause between requests
    Sys.sleep(.2)
    URL <- next_url
    url_list <- c(url_list, URL)
  }
}
# Download one page of API results and return its records as a data frame.
#
# x: a single URL (character string) whose JSON response has a `Result`
#    element holding a list of records. To process several URLs, call this
#    function once per URL, e.g. with lapply().
# Returns a data frame built by stacking data.frame(record) for every record
# (one row per record when all fields are scalar); an empty data frame when
# the page has no records.
get_all_data <- function(x) {
  # Get and subset the data
  page <- fromJSON(x)
  # Turn NULLs to NAs so every record keeps the same fields
  records <- lapply(page$Result, null_to_na)
  if (length(records) == 0) {
    return(data.frame())
  }
  # One rbind() call over all records instead of growing a frame in a loop;
  # this also handles a page with a single record
  do.call(rbind, lapply(records, data.frame))
}
# lapply over our URLs with get_all_data (one data frame per page)
yes <- lapply(url_list, get_all_data)
# Collapse list of df's to single df
yes <- bind_rows(yes)
# Add page 1
c_df <- rbind(c_df, yes)
# As tibble (the data frame is passed exactly once)
c_df <- as_tibble(c_df)
# Flag each event by whether any civilian deaths were recorded.
# Events with a zero or missing civilian count are labelled 'no'.
c_df <- mutate(
  c_df,
  civ_cat = if_else(deaths_civilians > 0, 'yes', 'no', missing = 'no')
)
# Save file for later use (save() needs the path as the named `file` argument)
#save(c_df, file = "data/conflict_19.Rds")
#load(file = "data/conflict_19.Rds")
Let’s take a first look at the data.
Random Subset
# Fix the RNG seed so the same five events are drawn on every run
set.seed(42)
# Draw five events with a best estimate above 10 deaths and list them by date
c_df %>%
  filter(best > 10) %>%
  sample_n(5) %>%
  select(date_start, country, side_a, side_b, deaths_a, deaths_b, civ_cat) %>%
  arrange(date_start) %>%
  kable(
    col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Civilian Causualties'),
    align = 'c',
    caption = 'Random Subset'
  )
| Date | Conflict Location | Side A | Side B | Deaths A | Deaths B | Civilian Causualties |
|---|---|---|---|---|---|---|
| 2019-03-12T00:00:00 | Nigeria | Government of Nigeria | IS | 0 | 22 | no |
| 2019-05-14T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 17 | no |
| 2019-09-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Santa Rosa de Lima Cartel | 0 | 0 | no |
| 2019-09-19T00:00:00 | DR Congo (Zaire) | Government of DR Congo (Zaire) | CMC | 23 | 0 | no |
| 2019-10-03T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 24 | no |
Most Events By Country
# Count events and sum the best-estimate deaths per country, then show the
# ten countries with the most events. The caption names the sort key
# (event count), matching the section heading.
c_df %>%
  group_by(country) %>%
  summarise(n = n(), total_estimated = sum(best)) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  kable(
    col.names = c('Conflict Location', 'Total Events', 'Total Deaths'),
    align = 'c',
    caption = 'Most Events by Country'
  )
| Conflict Location | Total Events | Total Deaths |
|---|---|---|
| Afghanistan | 4682 | 30434 |
| Syria | 2242 | 10931 |
| Mexico | 786 | 11789 |
| Nigeria | 509 | 2437 |
| Somalia | 426 | 2221 |
| DR Congo (Zaire) | 416 | 2393 |
| India | 352 | 728 |
| Brazil | 263 | 1296 |
| Iraq | 246 | 803 |
| Cameroon | 239 | 858 |
Afghanistan had far and away the most conflict events, followed by Syria and Mexico.
Events with the Largest Death Counts
# The ten events with the highest best-estimate death count
c_df %>%
  arrange(desc(best)) %>%
  head(10) %>%
  select(date_start, country, side_a, side_b, deaths_a, deaths_b, deaths_unknown, best, civ_cat) %>%
  kable(
    col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Unknown Deaths', 'Best Estimate for Total Deaths', 'Civilian Causualties'),
    align = 'c',
    caption = 'Largest Death Count per Conflict Event in 2019'
  )
| Date | Conflict Location | Side A | Side B | Deaths A | Deaths B | Unknown Deaths | Best Estimate for Total Deaths | Civilian Causualties |
|---|---|---|---|---|---|---|---|---|
| 2019-01-01T00:00:00 | Brazil | Comando Vermelho | GDE | 0 | 0 | 739 | 739 | no |
| 2019-10-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Santa Rosa de Lima Cartel | 0 | 0 | 278 | 278 | no |
| 2019-12-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Santa Rosa de Lima Cartel | 0 | 0 | 272 | 272 | no |
| 2019-12-01T00:00:00 | Mexico | Jalisco Cartel New Generation | La Familia | 0 | 0 | 266 | 266 | no |
| 2019-09-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Santa Rosa de Lima Cartel | 0 | 0 | 250 | 250 | no |
| 2019-11-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Santa Rosa de Lima Cartel | 0 | 0 | 245 | 245 | no |
| 2019-10-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Sinaloa Cartel | 0 | 0 | 231 | 231 | no |
| 2019-03-19T00:00:00 | Syria | IS | SDF | 20 | 0 | 165 | 230 | yes |
| 2019-07-01T00:00:00 | Mexico | Jalisco Cartel New Generation | La Familia | 0 | 0 | 229 | 229 | no |
| 2019-02-01T00:00:00 | Mexico | Jalisco Cartel New Generation | Santa Rosa de Lima Cartel | 0 | 0 | 226 | 226 | no |
We notice a few things from the table above. Mexico had many deadly conflict events in 2019, their dates are truncated to the first of the month, and nearly all deaths are classified as ‘unknown’. This leads me to believe that the deaths are not one single conflict event, but instead a collection of smaller events that are aggregated and reported at the end of the month.
USA Involved
# Events where actor id 769 appears on either side, oldest first
# (presumably the US government, per the side names in the output; verify
# against the UCDP actor list)
c_df %>%
  filter(side_a_new_id == 769 | side_b_new_id == 769) %>%
  select(date_start, country, side_a, side_b, deaths_a, deaths_b, civ_cat) %>%
  arrange(date_start) %>%
  kable(
    col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Civilian Causualties'),
    align = 'c',
    caption = 'USA Directly Involved'
  )
| Date | Conflict Location | Side A | Side B | Deaths A | Deaths B | Civilian Causualties |
|---|---|---|---|---|---|---|
| 2019-04-07T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 2 | no |
| 2019-05-21T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 2 | no |
| 2019-06-29T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 2 | no |
| 2019-06-29T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 1 | no |
| 2019-07-30T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 9 | no |
| 2019-07-30T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 4 | no |
| 2019-07-30T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 8 | no |
| 2019-09-07T00:00:00 | Afghanistan | Government of United States of America | al-Qaida | 0 | 9 | no |
There doesn’t appear to be a ton of direct US involvement recorded in the 19-year-old conflict in Afghanistan; the only events that list the US as a party are a handful against al-Qaida.
# Afghanistan government involved: actor id 130 on either side (presumably
# the Government of Afghanistan, per the side names in the output), showing
# the ten events with the most side-B deaths
c_df %>%
  filter(side_a_new_id == 130 | side_b_new_id == 130) %>%
  select(date_start, country, side_a, side_b, deaths_a, deaths_b, civ_cat) %>%
  arrange(desc(deaths_b)) %>%
  head(10) %>%
  kable(
    col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Civilian Causualties'),
    align = 'c',
    caption = 'Afghanistan Military'
  )
| Date | Conflict Location | Side A | Side B | Deaths A | Deaths B | Civilian Causualties |
|---|---|---|---|---|---|---|
| 2019-08-31T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 100 | no |
| 2019-04-06T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 24 | 99 | no |
| 2019-03-22T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 12 | 94 | yes |
| 2019-09-04T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 93 | no |
| 2019-03-26T00:00:00 | Afghanistan | Government of Afghanistan | IS | 0 | 87 | no |
| 2019-09-06T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 85 | no |
| 2019-09-14T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 84 | no |
| 2019-09-28T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 61 | no |
| 2019-11-24T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 8 | 60 | no |
| 2019-06-19T00:00:00 | Afghanistan | Government of Afghanistan | Taleban | 0 | 59 | no |
So it appears the US itself isn’t as active as the Afghan state military; however, considering the US spent 38 billion (with a b) dollars in 2019 alone, well…
Civilian Deaths Globally
# Number of events with and without recorded civilian deaths,
# smallest group first
c_df %>%
  count(civ_cat) %>%
  arrange(n) %>%
  kable(
    caption = 'Civilian Deaths',
    col.names = c('Civilian Deaths', 'Number of Events')
  )
| Civilian Deaths | Number of Events |
|---|---|
| yes | 2795 |
| no | 9689 |
Average Deaths Per Conflict
# Monthly death totals for countries with more than 300 events.
# Event start dates are floored to the first of their month, deaths are
# summed per country and month (one row per point on the line), and each
# country gets its own panel with a free y axis.
c_df %>%
  add_count(country) %>%
  filter(n > 300) %>%
  mutate(rounded_date = floor_date(as.Date(date_start), unit = 'month')) %>%
  group_by(country, rounded_date) %>%
  summarise(sum_deaths = sum(best)) %>%
  ungroup() %>%
  ggplot(aes(rounded_date, sum_deaths, color = country)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 45, vjust = .5), legend.position = 'none') +
  labs(x = 'Dates', y = 'Total Deaths', title = 'Deaths Per Month') +
  facet_wrap(~country, scales = 'free_y')

# Average best-estimate deaths per event for every country with more than
# 100 events, ordered from highest to lowest average. Each bar is labelled
# with its number of events (n). Does not show the standard deviation.
c_df %>%
  group_by(country) %>%
  summarise(average = sum(best) / n(), n = n()) %>%
  filter(n > 100) %>%
  mutate(country = fct_reorder(country, -average)) %>%
  ggplot(aes(country, average, fill = country)) +
  geom_col() +
  # angle is a fixed setting, so it is set outside aes()
  geom_text(aes(label = paste0('n= ', n)), angle = 90, hjust = 'top') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = 'none') +
  labs(x = 'Country', y = 'Average Deaths per Violent Encounter 2019') +
  scale_y_continuous(n.breaks = 8)

As mentioned above, Mexico’s death counts per event are suspicious. There might be some aggregation going on.
Deaths per Conflict
# Histogram of per-event death estimates, restricted to events with fewer
# than 50 deaths, with one panel per region. Axis labels are set once in
# labs().
c_df %>%
  filter(best < 50) %>%
  ggplot(aes(x = best, fill = region)) +
  geom_histogram(bins = 50) +
  facet_wrap(~region) +
  labs(
    title = 'Are some regions events more deadly?',
    x = 'Number of Deaths',
    y = 'Number of Events'
  )

The distributions appear to be about the same across regions.
# The World Map: country outlines used as the base layer
world <- map_data('world')
# One point per event, coloured by type of violence. The legend title is set
# through guides(); element_text() takes a font family as its first
# argument, so it cannot be used to supply the title text.
ggplot() +
  geom_map(data = world, map = world, aes(long, lat, map_id = region),
           color = 'white', fill = 'gray50', alpha = .2) +
  geom_point(data = c_df, aes(longitude, latitude, color = as.factor(type_of_violence)),
             alpha = .8) +
  labs(title = 'Global Violence') +
  # Labels are applied in factor-level order of type_of_violence
  scale_color_brewer(palette = 'Set1', labels = c('State Based', 'Non-State', 'One-Sided')) +
  guides(color = guide_legend('Type of Violence'))

# World map of all events, coloured by whether civilian deaths were recorded
ggplot() +
  geom_map(
    data = world, map = world,
    aes(long, lat, map_id = region),
    color = 'white', fill = 'gray50', alpha = .3
  ) +
  geom_point(
    data = c_df,
    aes(longitude, latitude, color = as.factor(civ_cat), group = id),
    alpha = .8
  ) +
  labs(title = 'Conflict Civilian Deaths 2019', x = '', y = '') +
  scale_color_brewer(palette = 'Dark2') +
  guides(color = guide_legend('Civilian Deaths'))

Let’s look at January 2019 for a few select countries.
# Afghanistan events that started in January 2019, with the date columns
# parsed to Date and only the columns needed for mapping kept
c_jan_19_af <- c_df %>%
  filter(country == 'Afghanistan') %>%
  mutate(date_start = as.Date(date_start), date_end = as.Date(date_end)) %>%
  filter(date_start >= '2019-1-1', date_start <= '2019-1-31') %>%
  select(id, best, latitude, longitude, side_a, side_b, date_start, date_end)
# Country outline, read from a local copy of the shapefile --
# https://hub.arcgis.com/datasets/2b63527870ef416bacf83bcaf388685f_0/data
afg_sf <- read_sf('afghanistan')
# Static map with one point per event. `ids` and `frame` are extra
# aesthetics picked up by ggplotly(): each start date (as character)
# becomes one animation frame.
afg <- ggplot(data = c_jan_19_af) +
  geom_sf(data = afg_sf, fill = 'gray50', alpha = .1) +
  geom_point(
    data = c_jan_19_af,
    aes(longitude, latitude, ids = id, frame = as.character(date_start)),
    alpha = .8, color = 'darkred', size = 1
  ) +
  labs(
    title = 'Conflict in Afghanistan January 2019',
    x = 'Longitude',
    y = 'Latitude'
  )
# Convert to a 500 x 500 plotly widget
afg <- ggplotly(afg, width = 500, height = 500)
# Animation options: 1000 for the frame setting, linear easing, no redraw
animation_opts(afg, 1000, easing = "linear", redraw = FALSE)
# India events that started in January 2019, with the date columns parsed
# to Date and only the columns needed for mapping kept
c_jan_19_in <- c_df %>%
  filter(country == 'India') %>%
  mutate(date_start = as.Date(date_start), date_end = as.Date(date_end)) %>%
  filter(date_start >= '2019-1-1', date_start <= '2019-1-31') %>%
  select(id, best, latitude, longitude, side_a, side_b, date_start, date_end)
# Country outline, read from a local copy of the shapefile --
# https://hub.arcgis.com/datasets/2b37b84e67374fb98577c20ef8be6c62_0
india_sf <- read_sf('india')
# Static map with one point per event. `ids` and `frame` are extra
# aesthetics picked up by ggplotly(): each start date (as character)
# becomes one animation frame.
ind <- ggplot(data = c_jan_19_in) +
  geom_sf(data = india_sf, fill = 'gray50', alpha = .1) +
  geom_point(
    data = c_jan_19_in,
    aes(longitude, latitude, ids = id, frame = as.character(date_start)),
    alpha = .8, color = 'darkred', size = 1
  ) +
  labs(
    title = 'Conflict in India January 2019',
    x = 'Longitude',
    y = 'Latitude'
  )
# Convert to a 500 x 500 plotly widget
ind <- ggplotly(ind, width = 500, height = 500)
# Animation options: 1000 for the frame setting, linear easing, no redraw
animation_opts(ind, 1000, easing = "linear", redraw = FALSE)